{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "%pip install datasets[audio]==1.16.1 umap-learn==0.5.1 datasets[s3] transformers[tf,torch,sentencepiece,vision,optuna,sklearn,onnxruntime]==4.11.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide\n",
    "from utils import *\n",
    "setup_chapter()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A First Look at Hugging Face Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import list_datasets\n",
    "\n",
    "all_datasets = list_datasets()\n",
    "print(f\"There are {len(all_datasets)} datasets currently available on the Hub\")\n",
    "print(f\"The first 10 are: {all_datasets[:10]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hide_output\n",
    "from datasets import load_dataset\n",
    "\n",
    "emotions = load_dataset(\"emotion\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emotions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ds = emotions[\"train\"]\n",
    "train_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(train_ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ds[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_ds.column_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(train_ds.features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(train_ds[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(train_ds[\"text\"][:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### From Datasets to DataFrames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "emotions.set_format(type=\"pandas\")\n",
    "df = emotions[\"train\"][:]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def label_int2str(row):\n",
    "    return emotions[\"train\"].features[\"label\"].int2str(row)\n",
    "\n",
    "df[\"label_name\"] = df[\"label\"].apply(label_int2str)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Looking at the Class Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df[\"label_name\"].value_counts(ascending=True).plot.barh()\n",
    "plt.title(\"Frequency of Classes\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How Long Are Our Tweets?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"Words Per Tweet\"] = df[\"text\"].str.split().apply(len)\n",
    "df.boxplot(\"Words Per Tweet\", by=\"label_name\", grid=False, showfliers=False,\n",
    "           color=\"black\")\n",
    "plt.suptitle(\"\")\n",
    "plt.xlabel(\"\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emotions.reset_format()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## From Text to Tokens"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Character Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"Tokenizing text is a core task of NLP.\"\n",
    "tokenized_text = list(text)\n",
    "print(tokenized_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}\n",
    "print(token2idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids = [token2idx[token] for token in tokenized_text]\n",
    "print(input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_df = pd.DataFrame(\n",
    "    {\"Name\": [\"Bumblebee\", \"Optimus Prime\", \"Megatron\"], \"Label ID\": [0,1,2]})\n",
    "categorical_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.get_dummies(categorical_df[\"Name\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn.functional as F\n",
    "\n",
    "input_ids = torch.tensor(input_ids)\n",
    "one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))\n",
    "one_hot_encodings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Token: {tokenized_text[0]}\")\n",
    "print(f\"Tensor index: {input_ids[0]}\")\n",
    "print(f\"One-hot: {one_hot_encodings[0]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Word Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_text = text.split()\n",
    "print(tokenized_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Subword Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hide_output\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_ckpt = \"distilbert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "encoded_text = tokenizer(text)\n",
    "print(encoded_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(tokenizer.convert_tokens_to_string(tokens))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.model_max_length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.model_input_names"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenizing the Whole Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(batch):\n",
    "    return tokenizer(batch[\"text\"], padding=True, truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(tokenize(emotions[\"train\"][:2]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide_input\n",
    "tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))\n",
    "data = sorted(tokens2ids, key=lambda x : x[-1])\n",
    "df = pd.DataFrame(data, columns=[\"Special Token\", \"Special Token ID\"])\n",
    "df.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hide_output\n",
    "emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(emotions_encoded[\"train\"].column_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training a Text Classifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Transformers as Feature Extractors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Using pretrained models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hide_output\n",
    "from transformers import AutoModel\n",
    "\n",
    "model_ckpt = \"distilbert-base-uncased\"\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "model = AutoModel.from_pretrained(model_ckpt).to(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Extracting the last hidden states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"this is a test\"\n",
    "inputs = tokenizer(text, return_tensors=\"pt\")\n",
    "print(f\"Input tensor shape: {inputs['input_ids'].size()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = {k:v.to(device) for k,v in inputs.items()}\n",
    "with torch.no_grad():\n",
    "    outputs = model(**inputs)\n",
    "print(outputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "outputs.last_hidden_state.size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "outputs.last_hidden_state[:,0].size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_hidden_states(batch):\n",
    "    # Place model inputs on the GPU\n",
    "    inputs = {k:v.to(device) for k,v in batch.items() \n",
    "              if k in tokenizer.model_input_names}\n",
    "    # Extract last hidden states\n",
    "    with torch.no_grad():\n",
    "        last_hidden_state = model(**inputs).last_hidden_state\n",
    "    # Return vector for [CLS] token\n",
    "    return {\"hidden_state\": last_hidden_state[:,0].cpu().numpy()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emotions_encoded.set_format(\"torch\", \n",
    "                            columns=[\"input_ids\", \"attention_mask\", \"label\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide_output\n",
    "emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emotions_hidden[\"train\"].column_names"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Creating a feature matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "X_train = np.array(emotions_hidden[\"train\"][\"hidden_state\"])\n",
    "X_valid = np.array(emotions_hidden[\"validation\"][\"hidden_state\"])\n",
    "y_train = np.array(emotions_hidden[\"train\"][\"label\"])\n",
    "y_valid = np.array(emotions_hidden[\"validation\"][\"label\"])\n",
    "X_train.shape, X_valid.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Visualizing the training set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from umap import UMAP\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Scale features to [0,1] range\n",
    "X_scaled = MinMaxScaler().fit_transform(X_train)\n",
    "# Initialize and fit UMAP\n",
    "mapper = UMAP(n_components=2, metric=\"cosine\").fit(X_scaled)\n",
    "# Create a DataFrame of 2D embeddings\n",
    "df_emb = pd.DataFrame(mapper.embedding_, columns=[\"X\", \"Y\"])\n",
    "df_emb[\"label\"] = y_train\n",
    "df_emb.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(2, 3, figsize=(7,5))\n",
    "axes = axes.flatten()\n",
    "cmaps = [\"Greys\", \"Blues\", \"Oranges\", \"Reds\", \"Purples\", \"Greens\"]\n",
    "labels = emotions[\"train\"].features[\"label\"].names\n",
    "\n",
    "for i, (label, cmap) in enumerate(zip(labels, cmaps)):\n",
    "    df_emb_sub = df_emb.query(f\"label == {i}\")\n",
    "    axes[i].hexbin(df_emb_sub[\"X\"], df_emb_sub[\"Y\"], cmap=cmap,\n",
    "                   gridsize=20, linewidths=(0,))\n",
    "    axes[i].set_title(label)\n",
    "    axes[i].set_xticks([]), axes[i].set_yticks([])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Training a simple classifier\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide_output\n",
    "# We increase `max_iter` to guarantee convergence \n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "lr_clf = LogisticRegression(max_iter=3000)\n",
    "lr_clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr_clf.score(X_valid, y_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.dummy import DummyClassifier\n",
    "\n",
    "dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n",
    "dummy_clf.fit(X_train, y_train)\n",
    "dummy_clf.score(X_valid, y_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix\n",
    "\n",
    "def plot_confusion_matrix(y_preds, y_true, labels):\n",
    "    cm = confusion_matrix(y_true, y_preds, normalize=\"true\")\n",
    "    fig, ax = plt.subplots(figsize=(6, 6))\n",
    "    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)\n",
    "    disp.plot(cmap=\"Blues\", values_format=\".2f\", ax=ax, colorbar=False)\n",
    "    plt.title(\"Normalized confusion matrix\")\n",
    "    plt.show()\n",
    "    \n",
    "y_preds = lr_clf.predict(X_valid)\n",
    "plot_confusion_matrix(y_preds, y_valid, labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fine-Tuning Transformers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Loading a pretrained model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hide_output\n",
    "from transformers import AutoModelForSequenceClassification\n",
    "\n",
    "num_labels = 6\n",
    "model = (AutoModelForSequenceClassification\n",
    "         .from_pretrained(model_ckpt, num_labels=num_labels)\n",
    "         .to(device))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Defining the performance metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, f1_score\n",
    "\n",
    "def compute_metrics(pred):\n",
    "    labels = pred.label_ids\n",
    "    preds = pred.predictions.argmax(-1)\n",
    "    f1 = f1_score(labels, preds, average=\"weighted\")\n",
    "    acc = accuracy_score(labels, preds)\n",
    "    return {\"accuracy\": acc, \"f1\": f1}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Training the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import HfFolder\n",
    "\n",
    "username  = 'simonmesserli' #replace with your own username from hugging face.\n",
    "hub_token = HfFolder.get_token()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training with SageMaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker.huggingface\n",
    "import sagemaker\n",
    "\n",
    "sess = sagemaker.Session()\n",
    "# sagemaker session bucket -> used for uploading data, models and logs\n",
    "# sagemaker will automatically create this bucket if it not exists\n",
    "sagemaker_session_bucket=None\n",
    "if sagemaker_session_bucket is None and sess is not None:\n",
    "    # set to default bucket if a bucket name is not given\n",
    "    sagemaker_session_bucket = sess.default_bucket()\n",
    "\n",
    "role = sagemaker.get_execution_role()\n",
    "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n",
    "\n",
    "print(f\"sagemaker role arn: {role}\")\n",
    "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n",
    "print(f\"sagemaker session region: {sess.boto_region_name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import botocore\n",
    "from datasets.filesystems import S3FileSystem\n",
    "\n",
    "s3 = S3FileSystem()\n",
    "\n",
    "s3_prefix = 'samples/datasets/02_classification'\n",
    "\n",
    "train_dataset=emotions_encoded[\"train\"]\n",
    "eval_dataset=emotions_encoded[\"validation\"]\n",
    "\n",
    "# save train_dataset to s3\n",
    "training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'\n",
    "train_dataset.save_to_disk(training_input_path, fs=s3)\n",
    "\n",
    "# save eval_dataset to s3\n",
    "eval_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/validation'\n",
    "eval_dataset.save_to_disk(eval_input_path, fs=s3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!pygmentize ./scripts/02_classification_train.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.huggingface import HuggingFace\n",
    "import time\n",
    "\n",
    "batch_size = 64\n",
    "logging_steps = len(emotions_encoded[\"train\"]) // batch_size\n",
    "model_name = f\"{model_ckpt}-finetuned-emotion\"\n",
    "\n",
    "# hyperparameters, which are passed into the training job\n",
    "hyperparameters={'model_id':model_ckpt,\n",
    "                 'num_train_epochs':2,\n",
    "                 'learning_rate':2e-5,\n",
    "                 'per_device_train_batch_size':batch_size,\n",
    "                 'per_device_eval_batch_size':batch_size,\n",
    "                 'learning_rate':2e-5,\n",
    "                 'weight_decay':0.01,\n",
    "                 'evaluation_strategy':\"epoch\",\n",
    "                 'disable_tqdm':False,\n",
    "                 'logging_steps':logging_steps,\n",
    "                 'push_to_hub':True,\n",
    "                 'hub_model_id':username + '/' + model_name,\n",
    "                 'hub_strategy':\"every_save\",\n",
    "                 'hub_token':hub_token\n",
    "                }\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define Training Job Name \n",
    "job_name = f'nlp-book-sagemaker-02classificaton-{time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.localtime())}'\n",
    "\n",
    "# create the Estimator\n",
    "huggingface_estimator = HuggingFace(\n",
    "    entry_point          = '02_classification_train.py', # fine-tuning script used in training jon\n",
    "    source_dir           = './scripts',                  # directory where fine-tuning script is stored\n",
    "    instance_type        = 'ml.p3.2xlarge',              # instances type used for the training job\n",
    "    instance_count       = 1,                            # the number of instances used for training\n",
    "    base_job_name        = job_name,                     # the name of the training job\n",
    "    role                 = role,                         # IAM role used in training job to access AWS ressources, e.g. Amazon S3\n",
    "    transformers_version = '4.11',                       # the transformers version used in the training job\n",
    "    pytorch_version      = '1.9',                        # the pytorch_version version used in the training job\n",
    "    py_version           = 'py38',                       # the python version used in the training job\n",
    "    hyperparameters      = hyperparameters,              # the hyperparameter used for running the training job\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# define a data input dictonary with our uploaded s3 uris\n",
    "data = {\n",
    "    'train': training_input_path,\n",
    "    'test': eval_input_path\n",
    "}\n",
    "\n",
    "# starting the train job with our uploaded datasets as input\n",
    "huggingface_estimator.fit(data, wait=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The logs can be found in Amazon CloudWatch: https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FTrainingJobs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the model is saved in the S3 bucket and was also pushed to the hugging face hub.\n",
    "print(huggingface_estimator.model_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Trainer, AutoModel\n",
    "\n",
    "# we load the model from the hub to the trainer and do further analyses.\n",
    "\n",
    "model_finetuned = AutoModelForSequenceClassification.from_pretrained('simonmesserli' + '/' + model_name)\n",
    "\n",
    "trainer = Trainer(model = model_finetuned)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy model with SageMaker Endpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "predictor = huggingface_estimator.deploy(1,\"ml.g4dn.xlarge\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "custom_tweet = {\"inputs\" : \"I saw a movie today and it was really good.\"}\n",
    "predictor.predict(custom_tweet)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After running your requests, make sure to delete your endpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictor.delete_endpoint()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hide_output\n",
    "preds_output = trainer.predict(emotions_encoded[\"validation\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds_output.metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_preds = np.argmax(preds_output.predictions, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_confusion_matrix(y_preds, y_valid, labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Error analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.nn.functional import cross_entropy\n",
    "\n",
    "def forward_pass_with_label(batch):\n",
    "    # Place all input tensors on the same device as the model\n",
    "    inputs = {k:v.to(device) for k,v in batch.items() \n",
    "              if k in tokenizer.model_input_names}\n",
    "\n",
    "    with torch.no_grad():\n",
    "        output = model(**inputs)\n",
    "        pred_label = torch.argmax(output.logits, axis=-1)\n",
    "        loss = cross_entropy(output.logits, batch[\"label\"].to(device), \n",
    "                             reduction=\"none\")\n",
    "\n",
    "    # Place outputs on CPU for compatibility with other dataset columns   \n",
    "    return {\"loss\": loss.cpu().numpy(), \n",
    "            \"predicted_label\": pred_label.cpu().numpy()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide_output\n",
    "# Convert our dataset back to PyTorch tensors\n",
    "emotions_encoded.set_format(\"torch\", \n",
    "                            columns=[\"input_ids\", \"attention_mask\", \"label\"])\n",
    "# Compute loss values\n",
    "emotions_encoded[\"validation\"] = emotions_encoded[\"validation\"].map(\n",
    "    forward_pass_with_label, batched=True, batch_size=16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emotions_encoded.set_format(\"pandas\")\n",
    "cols = [\"text\", \"label\", \"predicted_label\", \"loss\"]\n",
    "df_test = emotions_encoded[\"validation\"][:][cols]\n",
    "df_test[\"label\"] = df_test[\"label\"].apply(label_int2str)\n",
    "df_test[\"predicted_label\"] = (df_test[\"predicted_label\"]\n",
    "                              .apply(label_int2str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide_output\n",
    "df_test.sort_values(\"loss\", ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide_output\n",
    "df_test.sort_values(\"loss\", ascending=True).head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Saving and sharing the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide_output\n",
    "from transformers import pipeline\n",
    "\n",
    "# Change `simonmesserli` to your Hub username\n",
    "model_id = \"simonmesserli/distilbert-base-uncased-finetuned-emotion\"\n",
    "classifier = pipeline(\"text-classification\", model=model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "custom_tweet = \"I saw a movie today and it was really good.\"\n",
    "preds = classifier(custom_tweet, return_all_scores=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds_df = pd.DataFrame(preds[0])\n",
    "plt.bar(labels, 100 * preds_df[\"score\"], color='C0')\n",
    "plt.title(f'\"{custom_tweet}\"')\n",
    "plt.ylabel(\"Class probability (%)\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusion"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
